In this notebook, we run quick DE volcano plots, p-value histograms and pairwise cross plots for our conditions.
Code
source("../../scripts/R/de_functions.R")source("../../scripts/R/go_kegg_functions.R")suppressPackageStartupMessages({library(DESeq2)library(GenomicFeatures)library(RColorBrewer)library(gplots)library(biomaRt)library(grid)library(gridExtra)library(ggplot2)library(lattice)library(reshape)library(geneplotter)library(ggrepel)library(limma)library(tidyverse)library(eulerr)library(gghighlight)library(clusterProfiler) # install from githublibrary(org.Dm.eg.db)})
1.1 Functions
Code
## If the output directory doesn't exist, create itif(!dir.exists("../output/02_DE")){dir.create("../output/02_DE")}output_dir <-"../output/02_DE/"
# Set control as reference#colData(dds)$condition <- relevel(colData(dds)$condition, ref = "WT")# Removing lowly expressed genes, only to be done once at the start of the differential expression stepfilter =apply(counts(dds, normalized =TRUE), 1, function(x){ mean(x) >=10 })dds = dds[filter, ]dim(dds)
[1] 11145 20
Run DESeq:
Code
dds <-DESeq(dds, test ="Wald", parallel =TRUE)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates: 6 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 6 workers
### Note the order of W4_OE and CTRL - W4_OE here is the numerator, CTRL is the denominatorresults_W4OE_CTRL =get_dds_res(wald_dds, contrast =c("condition", "W4_OE", "Ctrl"), ensembl.genes, shrink =TRUE)
[1] "condition_W4_OE_vs_Ctrl"
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_W4OE_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
ego <-plotEGO_dm(down_genes, universe =rownames(dds), ont ="BP",title ="GO, downregulated genes")
Running GO for organism = drosophila melanogaster
[1] "4 enrichments found"
Code
ego
ID
Description
GeneRatio
BgRatio
pvalue
p.adjust
qvalue
geneID
Count
GO:0044242
cellular lipid catabolic process
5/55
86/8880
0.0001802
0.0654606
0.0596368
Pepck2/CG7900/CG15533/CG15534/CG3699
5
GO:0046395
carboxylic acid catabolic process
5/55
106/8880
0.0004774
0.0654606
0.0596368
cn/Oat/Pepck2/CG7900/CG3699
5
GO:0016054
organic acid catabolic process
5/55
107/8880
0.0004984
0.0654606
0.0596368
cn/Oat/Pepck2/CG7900/CG3699
5
GO:0016042
lipid catabolic process
5/55
117/8880
0.0007496
0.0738387
0.0672695
Pepck2/CG7900/CG15533/CG15534/CG3699
5
Code
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
### Note the order of W4_OE and CTRL - W4_OE here is the numerator, CTRL is the denominatorresults_O1OE_CTRL =get_dds_res(wald_dds, contrast =c("condition", "O1_OE", "Ctrl"), ensembl.genes, shrink =TRUE)
[1] "condition_O1_OE_vs_Ctrl"
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O1OE_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
[1] "6 enrichments found"
Code
custom_ekegg(up_ekegg, interesting_pathways = up_ekegg$Description,title ="KEGG, up, O1OE vs Ctrl")
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
[1] "16 enrichments found"
Code
down_ekegg
ID
Description
GeneRatio
BgRatio
pvalue
p.adjust
qvalue
geneID
Count
dme03030
dme03030
DNA replication - Drosophila melanogaster (fruit fly)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O1W4OE_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O2KO_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_W4KO_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O1KO_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O1W4KO_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O2W4KO_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
res = results_O1O2KO_CTRL hist(res$pvalue, xlab ="p-value", ylab ="Frequency")
Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):
df_res =as.data.frame(res)# Drop rows with NAs in pvalue - these are genes with high variabilitydf_res = df_res[complete.cases(df_res[ ,c("pvalue", "padj")]), ]
# Get the entrez IDsup_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% up_genes, ]$entrezgene_id)up_ekegg <-plotKEGG_dm(up_entrez, title ="KEGG, upregulated genes")
Running KEGG for organism = drosophila melanogaster
# Get the entrez IDsdown_entrez <-na.omit(ensembl.genes[ensembl.genes$gene_id %in% down_genes, ]$entrezgene_id)down_ekegg <-plotKEGG_dm(down_entrez, title ="KEGG, downregulated genes")
Running KEGG for organism = drosophila melanogaster